# Imports
import gzip
import os
from helper import *
import pandas as pd
import numpy as np
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
import nltk
from nltk.probability import FreqDist
import itertools
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import collections
import plotly
First we will start by investigating similar files that only differ in their names by 'raw' or 'token', in order to see the difference. We will start with the label2answer files.
# Read label2answer files, raw and token
# PATH and read_data come from helper.py (imported with *).
# NOTE(review): rows appear to be (answer_label, list of vocabulary indexes) —
# see the column names used when building the l2a dataframe below.
raw_label2answer = read_data(PATH + 'InsuranceQA.label2answer.raw.encoded.gz', "label2answer")
token_label2answer = read_data(PATH + 'InsuranceQA.label2answer.token.encoded.gz', "label2answer")
These files contain two columns, the answer labels and the answer text, i.e. every answer is assigned to a number.
Answers here are a list of indexes, each one of them is a key in the vocabulary dictionary. In the helper.py we defined functions that help us manipulate the indexes and map them to words.
# Select an example from the raw_label2answer data
' '.join(convert_from_idx_str(raw_label2answer[100][1]))
# Select an example from the token_label2answer data
' '.join(convert_from_idx_str(token_label2answer[100][1]))
As you can see from above the 'raw' contains the data as it's entered by the users! The 'token' one is the data processed(cleaned, word separated from punctuation) ready for vectorization i.e. ready for embedding.
For visualisation and better manipulation of the data we will use pandas dataframes for our analysis.
# dataFrame that contains the answer_labels and the answer_text
# We will consider the data in the 'token' files
l2a = pd.DataFrame(read_data(PATH + 'InsuranceQA.label2answer.token.encoded.gz', "label2answer"),
                   columns = ['answer_label', 'answer_idx'])
# to have every thing in the same dataFrame we will append the answer text to the dataframe
# convert_from_idx_str (helper.py) maps a list of vocabulary indexes to words.
l2a['answer_text'] = l2a['answer_idx'].apply(lambda a: convert_from_idx_str(a))
l2a.head()
In the helper.py file we defined a function that convert a list of indexes to the sentence associated to it (from vocabulary dictionary).
Stats and visualization
print("The number of answers is ", len(l2a))
# Adding the length of the questions
l2a['answer_length'] = l2a['answer_text'].apply(lambda q: len(q))
l2a.head()
# Description of the answers_length attribute
l2a[['answer_length']].describe()
The mean of the answers' length is 111.8 words and the median is 16, which means that the answers we have are quite long. Let's investigate its distribution further.
# The distribution of the length of answers
l2a['answer_length'].iplot(kind='scatter', xTitle = 'answer', yTitle = 'length'
, title='Answers length scatter plot')
From the scatter plot above it seems to be that the length of the answer is quite high. This may be due to the nature of our dataset, the answers about insurance need to be precise and well formulated in order to explain better to clients. This also can be due to the fact that collaborators are using a lot of formal expressions since they are talking to clients.
# The distribution of the length of answers
l2a['answer_length'].iplot(kind='hist', xTitle='length',histnorm = 'density',
yTitle='count', title='Answers length Distribution',linecolor = 'black')
The distribution is skewed to the right! This means that the mean length is bigger than the median length. Besides, the data points on the right side of the distribution are 'invisible' which tells us that they can be outliers! Let's use a box plot to see the 'suspectedoutliers' points.
# Box plot of the length of answres with suspectedoutliers as argument
l2a[['answer_length']].iplot(kind='box', mode='lines', boxpoints = 'suspectedoutliers')
Here from the size of the box plot we can see that we have outliers! So let's have a closer look.
# let's look at the questions with lenght > 1000
list(l2a[l2a['answer_length'] > 1000]['answer_text'].apply(lambda q: ' '.join(q)))[0]
As you can see from the example above, the insurance responses seems to be so technical sometimes so in order to make sure that the information is well transmitted to clients, collaborators uses examples and create scenarios.
# Let's take an answer with small length
list(l2a[l2a['answer_length'] < 40]['answer_text'].apply(lambda q: ' '.join(q)))[0]
Other questions are straight forward so the answer is relatively short!
However, the answer is always related to the question! So if the answer is too long it's because the question covered an important number of subjects. So let's look at the questions data.
# Reading the questions files, raw and token
raw_label2question = read_data(PATH + 'InsuranceQA.question.anslabel.raw.encoded.gz',"question.anslabel")
token_label2question = read_data(PATH + 'InsuranceQA.question.anslabel.token.encoded.gz',"question.anslabel")
raw_label2question[0]
Here the label2question data contains in the first position the domain of the asked question, followed by a list of indexes (again keys of words in the vocabulary dictionary) and a list of ground truths, which are labels of answers. This means that a question can have multiple answers.
' '.join(convert_from_idx_str(raw_label2question[0][1]))
' '.join(convert_from_idx_str(token_label2question[0][1]))
The same thing goes for label2question files, the 'raw' files are the questions as entered by clients and the 'token' are the one processed for machine learning purposes.
# Construct dataFrame for raw_label2question
l2q = pd.DataFrame(read_data(PATH + 'InsuranceQA.question.anslabel.token.encoded.gz',"question.anslabel"),
columns = ['domain', 'questions_idx', 'groundTruth_labels'])
l2q.head()
To have a better visualization of the data we will append the plain text associated to indexes.
# Add the questions plain text
l2q['questions_text'] = l2q['questions_idx'].apply(lambda q: ' '.join(convert_from_idx_str(q)))
l2q.head()
Next, we will handle groundTruth_labels with multiple values. We will explode those rows and we will duplicate the values of the remaining ones.
def split_data_frame_list(df, target_column):
    """
    Explode a column whose cells hold lists into one row per list element.
    arguments:
    df--dataframe
    target_column--name of the column that contains lists
    """
    # Drop missing cells, then spread each list across columns (one column
    # per element, NaN-padded) while keeping the source row index.
    non_null = df[target_column].dropna()
    wide = pd.DataFrame(non_null.tolist(), index=non_null.index)
    # Stack the element columns back into rows; the result carries a
    # MultiIndex of (original row, element position) and drops the NaN pads.
    return pd.DataFrame(wide.stack(), columns=[target_column])
# Explode the raws that contain more than one element in the groundTruth_labels list
df_groundTruth = split_data_frame_list(l2q, 'groundTruth_labels').reset_index().drop('level_1', axis = 1)
df_groundTruth.head()
# Merge the df_groundTruth withe l2q dataframe
l2q = df_groundTruth.merge(l2q, left_on = 'level_0', right_index = True).drop(['level_0'], axis = 1)
l2q.head()
Now every possible answer to a question is considered as a separate question-answer tuple.
# Add the question plain text to the dataframe
l2q['groundTruth_text'] = l2q['groundTruth_labels_x'].apply(lambda t:
' '.join(convert_from_idx_str(token_label2answer[int(t)-1][1])))
# Reorder the columns for better visualization
l2q = l2q[['domain', 'questions_idx', 'groundTruth_labels_y', 'groundTruth_labels_x',
'questions_text', 'groundTruth_text']]
l2q.head()
Stats and visualization
print("Number of questions is", len(l2q.groupby('questions_text').count()))
print("Number of answers used is", len(l2q))
The number of answer from the label2answer is 27413 and the number of answers in label2question is 27987. This means that some answers are used multiple times, they answer multiple questions at the same time. Let's see an example of these answers.
# Since our dataframe contains tuples of questions and answers we need to
# groupby the groundTruth text to find duplicated answers.
duplicated_answers = l2q.groupby('groundTruth_text').count()[['domain']].rename(columns = {'domain': 'count'})
# Keep only duplicated rows (answers that serve more than one question)
duplicated_answers = duplicated_answers[duplicated_answers['count']>1]
print("The number of duplicated answers is", len(duplicated_answers))
# The distribution of the number of duplications per answer
duplicated_answers['count'].iplot(kind='hist', xTitle='number of duplications',
                                  yTitle='count', title='distribution of duplication',linecolor = 'black')
The number of answers duplicated twice represents the majority, 485 out of 520. However, there are 15 answers duplicated more than 4 times! Let's look at an example.
# Select rows duplicated more than 4 times
duplicated_answers = duplicated_answers[duplicated_answers['count'] > 4]
duplicated_answers.head()
Let's investigate questions associated to these answers
duplicated_answers.merge(l2q, left_index = True, right_on = 'groundTruth_text')[['domain', 'questions_text',
'groundTruth_text']]
As you can see above the questions are 'similar', so the answer duplication is a 'normal' behavior.
'How Much Is It To Get Health Insurance?' is essentially the same question as 'How Much Does A Good Health Insurance Cost?',
so sharing one answer makes sense.
Our data contains the domain field, so let's see the number of domains covered.
# Construct the domain dataFrame: number of questions per domain
domains = l2q.groupby('domain').count()[['questions_idx']].rename(columns = {'questions_idx': 'count'})
# sort count values in decreasing order
domains = domains.sort_values('count', ascending = False).reset_index()
domains
print("the number of domains is", len(domains))
# Pie chart of the number of questions per domain
domains.iplot(kind='pie',labels='domain',values='count',pull=.2,hole=.2,
              colorscale='blues',textposition='outside',textinfo='value+percent')
As you can observe from the pie chart the life insurance domains is the most important. It represents 36.5% of the data. Critical-illness-insurance and other-insurance represents together only 1.3 % of the data which is very small compared to the remaining domains.
Next I will see the distribution of the questions length as we did for answers length.
# Groupby questions in order to eleminate duplicated question and keep only questions
questions = l2q.groupby('questions_text').count().reset_index()[['questions_text']]
# Add the length of the questions feature
questions['questions_length'] = questions['questions_text'].apply(lambda q: len(q))
questions.head()
# The distribution of the length of questions
questions['questions_length'].iplot(kind = 'scatter', xTitle = 'question', yTitle = 'length',
mode = 'markers' , title='Questions length scatter plot')
The length of the questions is smaller than the length of answer. This is kind of normal since collaborators develop answers to make it clear for the client.
# The distribution of the length of questions
questions['questions_length'].iplot(kind='hist', xTitle='length',histnorm = 'density',
yTitle='count', title='Questions length Distribution',linecolor = 'black')
The distribution of the answers is similar to the questions distribution! Which is something we expected, long questions demand detailed answer.
raw_anslabel = read_data(PATH + 'InsuranceQA.question.anslabel.raw.100.pool.solr.test.encoded.gz', "anslabel")
token_anslabel = read_data(PATH + 'InsuranceQA.question.anslabel.token.100.pool.solr.test.encoded.gz', "anslabel")
These files are composed of the domain and the question, followed by the ground truth selected from the answers in the pool. The pool size is defined by the number of possible answers; in our case we selected a pool size equal to 100.
At this level the questions and the answers are indexes(keys in vocabulary dictionary) and labels(index in label2answer). Pool includes ground_truth and also randomly selected negative answers.
# Read the file in a dataframe
raw_al = pd.DataFrame(token_anslabel, columns = ['domain', 'Questions', 'groundTruth', 'pool'])
raw_al.head()
This data is for training models; it's the same as the one constructed in the label2question part, so no further analysis is carried out.
def fdist(words):
    '''
    Build the word frequency distribution for a list of words.
    arguments:
    words--list of words (tokens)
    '''
    # Delegate the counting to nltk's FreqDist.
    distribution = FreqDist(words)
    return distribution
# Construct the frequence distance
fdist_answer = fdist(list(itertools.chain(*l2a['answer_text'])))
# top 10 most common words
fdist_answer.most_common(10)
'insurance' appears at the 10th position, this is a very high position because it's ranked with the stopwords.
# Plot the curve of words frequency
def freq_plot(freq, samples):
    '''
    This function takes the frequency distribution
    and plots the most common number of samples
    arguments:
    freq--the frequency distribution (nltk FreqDist)
    samples--number of samples to plot
    '''
    # Prepare the figure before FreqDist.plot draws into the current axes.
    plt.figure(figsize=(15,6))
    plt.title('Word frequency')
    plt.xlabel('word')
    # cumulative=False plots raw counts rather than a cumulative curve.
    freq.plot(samples, cumulative = False)
    plt.show()
freq_plot(fdist_answer, 50)
The curve of the words frequency decreases exponentially.
Stopwords are considered noise in the text. Text may contain stop words such as is, am, are, this, a, an, the, etc. So let's remove them.
from nltk.tokenize import word_tokenize
import copy
# Download stop words
stop_words = set(stopwords.words("english"))
# Define a function to remove stopwords
def remove_stopwords(tokenized_sent, stop_words = list(stop_words), flatten_sentences = True):
    '''
    This function removes stopwords from sentences
    arguments:
    tokenized_sent--list of tokenized words
    stop_words--list of stopwords (here downloaded from nltk)
    flatten_sentences--when it's a list of sentences flatten it
    '''
    if flatten_sentences:
        tokenized_sent = list(itertools.chain(*tokenized_sent))
    # Use a set for O(1) membership tests instead of scanning the
    # stopword list once per word.
    stop_set = set(stop_words)
    # Compare lowercased (nltk stopword lists are lowercase) but keep the
    # original casing in the output. str.lower() already returns a new
    # string, so the original copy.copy(w) was unnecessary.
    return [w for w in tokenized_sent if w.lower() not in stop_set]
# Create a filtred list of answers words (no stopwords)
filtered_sent_answer = remove_stopwords(list(l2a['answer_text']))
# Plot the most common words (We selected words with than 2 char to avoid punctuation)
freq_plot(fdist([l for l in filtered_sent_answer if len(l)>2]),20)
Stemming is a process of linguistic normalization, which reduces words to their root word or chops off the derivational affixes. For example, connection, connected, connecting reduce to a common word "connect".
This may helps us understand better the words used in our insurance data.
# Create the stemmer
ps = PorterStemmer()
def stemm(filtered_sent):
    '''
    Reduce every word to its root form with the Porter stemmer.
    arguments:
    filtered_sent--a list of words (in our case it's without stopwords)
    '''
    # ps is the PorterStemmer instance created at module level.
    return [ps.stem(token) for token in filtered_sent]
# Create the stemmed list of words
stemmed_words_answer = stemm(filtered_sent_answer)
# Construct the frequence distance from a list of words
fdist_question = fdist(list(itertools.chain(*l2q['questions_text'].apply(lambda q: q.split(' ')))))
# top 10 most common words
fdist_question.most_common(10)
Again the insurance words is in a good position but it's too much lower than in the answers case! However we can notice that the question terms are the most common.
freq_plot(fdist_question, 50)
# Create a filtred list of answers words (no stopwords)
filtered_sent_questions = remove_stopwords(list(l2q['questions_text'].apply(lambda q: q.split(' '))))
# Plot the most common words (We selected words with than 1 char to avoid punctuation)
freq_plot(fdist([l for l in filtered_sent_questions if len(l)>1]),20)
# Answer words, flattened over all answers (answer_text holds token lists)
answer_words = list(itertools.chain(*l2a['answer_text']))
# Question words (questions_text holds space-joined strings, so split first)
question_words = list(itertools.chain(*l2q['questions_text'].apply(lambda q: q.split(' '))))
# Construct answer pos tags with nltk's coarse 'universal' tagset (NOUN, VERB, ...)
answer_pos_tag = nltk.pos_tag(answer_words, tagset = 'universal')
# Construct question pos tags
question_pos_tag = nltk.pos_tag(question_words, tagset = 'universal')
# Construct dataframe from the (word, tag) pairs
answer_pos_tag = pd.DataFrame(answer_pos_tag, columns = ['words', 'tag'])
question_pos_tag = pd.DataFrame(question_pos_tag, columns = ['words', 'tag'])
# pie chart of the distribution of tags in answers
answer_pos_tag.groupby('tag').count().sort_values('words', ascending = False).reset_index().\
    iplot(kind='pie',labels='tag',values='words',pull=.1,hole=.2,
          colorscale='blues',textposition='outside',textinfo='value+percent')
# pie chart of the distribution of tags in questions
question_pos_tag.groupby('tag').count().sort_values('words', ascending = False).reset_index().\
    iplot(kind='pie',labels='tag',values='words',pull=.2,hole=.2,
          colorscale='reds',textposition='outside',textinfo='value+percent')
For both questions and answers noun and verb are the most frequent tags. However, for questions it represents 51% of the words.
from textblob import TextBlob, Word, Blobber
# Add polarity to label2question
l2q['polarity'] = l2q['questions_text'].map(lambda text: TextBlob(text).sentiment.polarity)
# Add polarity to label2answer
l2a['polarity'] = l2a['answer_text'].apply(lambda a: ' '.join(a)).\
map(lambda text: TextBlob(text).sentiment.polarity)
l2q['polarity'].iplot(kind = 'hist', bins = 50, xTitle = 'polarity',linecolor = 'black', yTitle = 'count',
title = 'Sentiment Polarity Distribution')
For the questions the polarity is concentrated around 0; this means that clients do not fear asking questions! Or maybe they don't want to transmit their frustration to the collaborator.
l2a['polarity'].iplot(kind = 'hist', bins = 50, xTitle = 'polarity',linecolor = 'black', yTitle = 'count',
title = 'Sentiment Polarity Distribution')
On the other hand, answers are more skewed towards the positive polarity side. This is rather good since collaborators transmit their enthusiasm.
n_topics = 6
def lsa(sentences, lsa_model):
    '''
    Fit a TF-IDF vectorizer on the sentences and project the resulting
    document-term matrix through the given LSA (TruncatedSVD) model.
    Returns (lsa_topic_matrix, document_term_matrix, tfidf_vectorizer).
    '''
    vectorizer = TfidfVectorizer(stop_words = 'english', use_idf = True, smooth_idf = True)
    # .values turns the pandas Series into a plain array of raw strings.
    doc_term = vectorizer.fit_transform(sentences.values)
    topic_matrix = lsa_model.fit_transform(doc_term)
    return topic_matrix, doc_term, vectorizer
def get_keys(topic_matrix):
    '''
    returns an integer list of predicted topic
    categories for a given topic matrix
    '''
    # For each document (row), keep the index of its strongest topic.
    return topic_matrix.argmax(axis = 1).tolist()
def keys_to_counts(keys):
    '''
    returns a tuple of topic categories and their
    accompanying magnitudes for a given list of keys
    '''
    counter = collections.Counter(keys)
    # Counter preserves first-seen insertion order, so the categories come
    # out in the same order as iterating the original (key, count) pairs.
    categories = list(counter.keys())
    counts = list(counter.values())
    return (categories, counts)
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer):
    '''
    returns a list of n_topic strings, where each string contains the n most common
    words in a predicted category, in order
    '''
    # Uses the module-level n_topics. document_term_matrix is the sparse
    # TF-IDF matrix (documents x vocabulary); keys holds one predicted
    # topic id per document.
    top_word_indices = []
    for topic in range(n_topics):
        # Sum the TF-IDF rows of every document assigned to this topic.
        # NOTE(review): if a topic has no documents, temp_vector_sum stays
        # the int 0 and .toarray() below would raise — confirm this cannot
        # happen with the data used here.
        temp_vector_sum = 0
        for i in range(len(keys)):
            if keys[i] == topic:
                temp_vector_sum += document_term_matrix[i]
        # Densify the sparse sum so numpy can sort it.
        temp_vector_sum = temp_vector_sum.toarray()
        # Vocabulary indices of the n highest-weight words, strongest first.
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:],0)
        top_word_indices.append(top_n_word_indices)
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            # Build a one-hot vocabulary vector and let the vectorizer map
            # it back to the corresponding word string.
            temp_word_vector = np.zeros((1,document_term_matrix.shape[1]))
            temp_word_vector[:,index] = 1
            the_word = tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0]
            topic_words.append(the_word.encode('ascii').decode('utf-8'))
        # One space-joined string of top words per topic.
        top_words.append(" ".join(topic_words))
    return top_words
def plot_topics(nb_word, lsa_keys, document_term_matrix, tfidf_vectorizer, lsa_categories, lsa_counts, title):
    '''
    This function will plot a histogram of topics
    arguments:
    nb_word--number of top words shown in each bar label
    lsa_keys--predicted topic id per document
    lsa_categories--topic ids present in the data
    lsa_counts--number of documents per topic (parallel to lsa_categories)
    title--plot title
    '''
    # Label each bar with its topic id and its top words.
    top_words = get_top_n_words(nb_word, lsa_keys, document_term_matrix, tfidf_vectorizer)
    labels = ['Topic {}: \n'.format(i) + top_words[i] for i in lsa_categories]
    fig, ax = plt.subplots(figsize = (20,8))
    ax.bar(lsa_categories, lsa_counts)
    ax.set_xticks(lsa_categories)
    ax.set_xticklabels(labels)
    ax.set_ylabel('Count')
    ax.set_title(title)
    plt.show()
def run_LSA(n_topics, sentences,title, nb_word_topic = 5, nb_word_plot = 3):
    '''
    Build an LSA (TruncatedSVD) model over the sentences, print the top
    words per topic and plot the topic histogram.
    Arguments:
    n_topics--number of topics
    sentences--list of sentences
    nb_word_topic--number of words per topic printed
    nb_word_plot--number of words per topic plotted
    '''
    # Fit the SVD-based LSA model on the TF-IDF representation.
    svd = TruncatedSVD(n_components = n_topics)
    topic_matrix, doc_term_matrix, vectorizer = lsa(sentences, svd)
    # Predicted topic per document, plus counts per topic.
    keys = get_keys(topic_matrix)
    categories, counts = keys_to_counts(keys)
    # Print the words that characterize every topic.
    for idx, words in enumerate(get_top_n_words(nb_word_topic, keys, doc_term_matrix, vectorizer)):
        print("Words characterizing topic {}: ".format(idx + 1), words)
    # Visualize the topic counts as a labelled histogram.
    plot_topics(nb_word_plot, keys, doc_term_matrix, vectorizer, categories, counts, title)
    return topic_matrix, doc_term_matrix, vectorizer, keys
l2a['answers_text_no_stopWords'] = l2a['answer_text'].map(lambda a: ' '.join(remove_stopwords(a, flatten_sentences = False)))
lsa_topic_matrix_a, document_term_matrix_a, tfidf_vectorizer_a, lsa_keys_a = \
run_LSA(6, l2a['answers_text_no_stopWords'],'LSA answers topics')
l2q['questions_text_no_stopWords'] = l2q['questions_text'].map(lambda q: ' '.join(remove_stopwords(q.split(' '), flatten_sentences = False)))
lsa_topic_matrix_q, document_term_matrix_q, tfidf_vectorizer_q, lsa_keys_q = \
run_LSA(6, l2q['questions_text_no_stopWords'],'LSA questions topics')
In both answer and question dataframes the life insurance policy are the most mentioned topic. However, other topics are somehow related(if we plot more words per topic) but still no direct correlation.
TSNE clusters
def create_model_lsa_tsne(lsa_topic_matrix):
    '''
    Project an LSA topic matrix to 2D with t-SNE.
    Arguments:
    lsa_topic_matrix--topic weights per document
    '''
    # Fixed random_state keeps the embedding reproducible across runs.
    projector = TSNE(n_components = 2, perplexity = 50, learning_rate = 100,
                     n_iter = 2000, verbose = 1, random_state = 0, angle = 0.75)
    return projector.fit_transform(lsa_topic_matrix)
# Compute tsne_lsa_vectors for questions
tsne_lsa_vectors_q = create_model_lsa_tsne(lsa_topic_matrix_q)
# Compute tsne_lsa_vectors for answers
tsne_lsa_vectors_a = create_model_lsa_tsne(lsa_topic_matrix_a)
colormap = np.array([
"#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
"#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
"#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
"#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5" ])
colormap = colormap[:n_topics]
def get_mean_topic_vectors(keys, two_dim_vectors):
    '''
    returns a list of centroid vectors from each predicted topic category
    '''
    centroids = []
    for topic in range(n_topics):
        # Collect the 2D points of every document assigned to this topic...
        members = [two_dim_vectors[i] for i, k in enumerate(keys) if k == topic]
        # ...and average them into one centroid per topic.
        centroids.append(np.mean(np.vstack(members), axis=0))
    return centroids
def cluster_bokeh_tsne(lsa_keys, document_term_matrix, tfidf_vectorizer, tsne_lsa_vectors, title ):
    '''
    This function will plot the cluster with bokeh library
    arguments:
    lsa_keys--predicted topic id per document
    tsne_lsa_vectors--2D t-SNE projection of the documents
    title--text inserted in the plot title
    '''
    # Three most characteristic words per topic, used as cluster labels.
    top_3_words_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
    # Centroid of each topic's points: where the text labels are drawn.
    lsa_mean_topic_vectors = get_mean_topic_vectors(lsa_keys, tsne_lsa_vectors)
    plot = figure(title="t-SNE "+ title + " Clustering of {} ".format(n_topics) +" LSA Topics", plot_width=700, plot_height=700)
    # One point per document, coloured by its topic via the module-level colormap.
    plot.scatter(x = tsne_lsa_vectors[:,0], y = tsne_lsa_vectors[:,1], color = colormap[lsa_keys])
    for t in range(n_topics):
        label = Label(x = lsa_mean_topic_vectors[t][0], y = lsa_mean_topic_vectors[t][1],
                      text = top_3_words_lsa[t], text_color = colormap[t])
        plot.add_layout(label)
    show(plot)
def cluster_plotly_tsne(tsne_lsa_vectors,lsa_mean_topic_vectors,lsa_keys,\
                        document_term_matrix, tfidf_vectorizer, title):
    '''
    This function will plot the clusters with plotly library
    arguments:
    tsne_lsa_vectors--2D t-SNE projection of the documents
    lsa_mean_topic_vectors--one centroid per topic
    lsa_keys--predicted topic id per document
    title--text inserted in the plot title
    '''
    # One marker per document, coloured by its predicted topic.
    trace1 = go.Scattergl(
        x = np.array(tsne_lsa_vectors[:,0]),
        y = np.array(tsne_lsa_vectors[:,1]),
        mode = 'markers', marker = dict(size = 5,color = colormap[lsa_keys]))
    top_3_words_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
    # BUG FIX: the original rebuilt this identical trace n_topics times in a
    # loop (the loop variable was never used); the trace already contains
    # every centroid, so build it once.
    trace2 = go.Scatter(
        x = np.array(list(zip(*lsa_mean_topic_vectors))[0]),
        y = np.array(list(zip(*lsa_mean_topic_vectors))[1]),
        mode = 'markers+text',
        text = np.array(top_3_words_lsa[0:n_topics]),
        textfont = dict(
            family = 'sans serif',
            size = 15,
            color = colormap[0:n_topics]))
    data = [trace1,trace2]
    layout = go.Layout(title = "t-SNE "+ title + " Clustering of {} ".format(n_topics) +" LSA Topics",
                       margin = dict(l = 0, r = 0, b = 0, t = 0))
    fig = go.Figure(data = data, layout = layout)
    plotly.offline.iplot(fig)
Questions plots
cluster_bokeh_tsne(lsa_keys_q, document_term_matrix_q, tfidf_vectorizer_q, tsne_lsa_vectors_q, 'Questions')
cluster_plotly_tsne(tsne_lsa_vectors_q,get_mean_topic_vectors(lsa_keys_q, tsne_lsa_vectors_q),\
lsa_keys_q, document_term_matrix_q, tfidf_vectorizer_q, 'Questions')
It looks like we have some clusters, but nothing solid to interpret for now.
Answers plot
cluster_bokeh_tsne(lsa_keys_a, document_term_matrix_a, tfidf_vectorizer_a, tsne_lsa_vectors_a, 'Answers')
cluster_plotly_tsne(tsne_lsa_vectors_a, get_mean_topic_vectors(lsa_keys_a, tsne_lsa_vectors_a),\
lsa_keys_a, document_term_matrix_a, tfidf_vectorizer_a, 'Answers')
With tnse on top of LSA, the answers seem to have separated topics but for the questions it's not clear if we have real separation between clusters so let's try with LDA analysis.
from sklearn.decomposition import LatentDirichletAllocation
def create_model_lda(document_term_matrix):
    '''
    Fit an online LDA model with n_topics components (module-level) and
    return the document-topic matrix.
    '''
    model = LatentDirichletAllocation(n_components=n_topics, learning_method='online',
                                      random_state=0, verbose=0)
    return model.fit_transform(document_term_matrix)
def lda_topics(lda_topic_matrix, document_term_matrix, tfidf_vectorizer,title, \
               nb_word_topic = 5, nb_word_plot = 3):
    '''
    Print the top words of every LDA topic and plot the topic histogram.
    arguments:
    lda_topic_matrix--document-topic matrix from the LDA model
    nb_word_topic--number of words per topic printed
    nb_word_plot--number of words per topic plotted
    title--plot title
    '''
    lda_keys = get_keys(lda_topic_matrix)
    lda_categories, lda_counts = keys_to_counts(lda_keys)
    # BUG FIX: nb_word_topic was silently ignored (a literal 3 was passed
    # to get_top_n_words), so the parameter had no effect.
    top_n_words_lda = get_top_n_words(nb_word_topic, lda_keys, document_term_matrix, tfidf_vectorizer)
    for i in range(len(top_n_words_lda)):
        print("Topic {}: ".format(i+1), top_n_words_lda[i])
    plot_topics(nb_word_plot, lda_keys, document_term_matrix, tfidf_vectorizer,lda_categories, lda_counts,\
                title)
# Create lda topic matrix for anwers
lda_topic_matrix_a = create_model_lda(document_term_matrix_a)
# Get important terms per clusters + plots
lda_topics(lda_topic_matrix_a, document_term_matrix_a, tfidf_vectorizer_a, 'LDA Answers topics')
# Create lda topic matrix
lda_topic_matrix_q = create_model_lda(document_term_matrix_q)
# Get important terms per clusters + plots
lda_topics(lda_topic_matrix_q, document_term_matrix_q, tfidf_vectorizer_q, 'LDA Questions topics')
def create_model_lda_tsne(lda_topic_matrix):
    '''
    Project an LDA topic matrix to 2D with t-SNE (same settings as the
    LSA projection for comparability).
    '''
    projector = TSNE(n_components = 2, perplexity = 50, learning_rate = 100,
                     n_iter = 2000, verbose = 1, random_state = 0, angle = 0.75)
    return projector.fit_transform(lda_topic_matrix)
# Create tsne vectors for answers
tsne_lda_vectors_a = create_model_lda_tsne(lda_topic_matrix_a)
# Create tsne vectors for questions
tsne_lda_vectors_q = create_model_lda_tsne(lda_topic_matrix_q)
cluster_bokeh_tsne(get_keys(lda_topic_matrix_a), document_term_matrix_a, tfidf_vectorizer_a, \
tsne_lda_vectors_a, 'Answers')
As you can see the clusters are roughly 2 but the separation is clear
cluster_bokeh_tsne(get_keys(lda_topic_matrix_q), document_term_matrix_q, tfidf_vectorizer_q, \
tsne_lda_vectors_q, 'Questions')
Again! The clusters for the answers are better than those for the questions. However, even though LDA is supposed to be more powerful than LSA, we obtained better clusters with LSA, so for the next analysis we will stick to the LSA classifications.
Save Dataframes
l2q.to_csv('l2q.csv')
l2a.to_csv('l2a.csv')
l2q['lsa_classification'] = get_keys(lsa_topic_matrix_q)
l2q['lda_classification'] = get_keys(lda_topic_matrix_q)
# Save dataframes with assignement
l2q.to_csv('l2q.csv')
l2a.to_csv('l2a.csv')
# load dataframes
l2q = pd.read_csv('l2q.csv', index_col=0)
l2a = pd.read_csv('l2a.csv', index_col=0)
# Groupby lsa clusters
domain_vs_cluster_lsa = l2q.groupby(['domain', 'lsa_classification']).count()[['questions_idx']].\
rename(columns = {'questions_idx': 'count'}).sort_values('count', ascending = False) \
.reset_index()
domain_vs_cluster_lsa.sort_values('lsa_classification', ascending = False).head()
Let's analyze each one of clusters.
domain_vs_cluster_lsa[domain_vs_cluster_lsa['lsa_classification'] == 5]
By far the auto-insurance is the best fit for cluster 5! So we can assign lsa_classification = 5 to auto-insurance
domain_vs_cluster_lsa[domain_vs_cluster_lsa['lsa_classification'] == 4]
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'annuities']
Here the disability-insurance in the most important topic 4 but annuities count is close to the best count and since most of annuities questions are in topic 4 we will say that topic 4 = disability-insurance + annuities
domain_vs_cluster_lsa[domain_vs_cluster_lsa['lsa_classification'] == 3]
# Since the retirement is relatively high compared to other domains let's how many retirements do I have
# per class
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'retirement-plans']
Here again we have an obvious result! The medicare-insurance is by far the best fit for topic 3 but we can associate with it the retirement plans so topic 3 = medicare-insurance + retirement-plans
domain_vs_cluster_lsa[domain_vs_cluster_lsa['lsa_classification'] == 2]
# Count of health-insurance domain per cluster
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'health-insurance']
# Count of home-insurance domain per cluster
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'home-insurance']
# Count of renter_insurance-insurance
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'renters-insurance']
# Count of auto-insurance domain
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'auto-insurance']
From the cluster assignment and the count of assignment per domain we can say that topic 2 = health-insurance + home-insurance + renters-insurance
domain_vs_cluster_lsa[domain_vs_cluster_lsa['lsa_classification'] == 1]
# Count of domain long-ter,m-care-insurance per cluster
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'long-term-care-insurance']
From previous cluster assignments and from the values in the dataframe above we can safely say that topic 1 is associated to long-term-care-insurance.
domain_vs_cluster_lsa[domain_vs_cluster_lsa['lsa_classification'] == 0]
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'life-insurance']
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'critical-illness-insurance']
domain_vs_cluster_lsa[domain_vs_cluster_lsa['domain'] == 'other-insurance']
Also from previous assignments and from domain counts per cluster above we can associate topic 0 to life-insurance + critical-illness-insurance + other-insurance
| Cluster/topic | domain | |
|---|---|---|
| 0 | life-insurance + critical-illness-insurance + other-insurance |
|
| 1 | long-term-care-insurance |
|
| 2 | health-insurance + home-insurance + renters-insurance |
|
| 3 | medicare-insurance + retirement-plans |
|
| 4 | disability-insurance + annuities | |
| 5 | auto-insurance |
# Free the large intermediate objects now that the analysis is finished
# and the dataframes have been saved to CSV.
del domain_vs_cluster_lsa
del l2a, l2q
del raw_anslabel, raw_label2answer, raw_label2question
del token_anslabel, token_label2answer, token_label2question
del df_groundTruth
del questions, answer_pos_tag, answer_words
del stemmed_words_answer, fdist_answer, fdist_question
del tfidf_vectorizer_a, tfidf_vectorizer_q
del lda_topic_matrix_a, lda_topic_matrix_q, document_term_matrix_a, document_term_matrix_q
# NOTE: this also deletes the functions lsa, lda_topics and fdist themselves.
del lsa, lda_topics, lsa_keys_a, lsa_keys_q, lsa_topic_matrix_a, lsa_topic_matrix_q
del fdist